Generating matrix of the rsem estimated_counts

# Loading results

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
setwd("C:\\Users\\mapanavici\\RStudio\\R")

Ov4Carbopool3 <- read.delim("WTCHG_626197_201106_.genes.results", header = TRUE)

Ovcar4Parental1 <- read.delim("WTCHG_626197_289105_.genes.results", header = TRUE)

Ovcar4Parental2 <- read.delim("WTCHG_626197_290117_.genes.results", header = TRUE)

Ovcar4Parental3 <- read.delim("WTCHG_626197_291129_.genes.results", header = TRUE)

Ov4Carbopool1 <- read.delim("WTCHG_626197_295177_.genes.results", header = TRUE)
 
Ov4Carbopool2 <- read.delim("WTCHG_626197_296189_.genes.results", header = TRUE)
# Obtaining "gene_id" and "expected_count" columns, renaming "expected_count" column to individual sample.names

get_counts <- function(sample_df) {
  
  id_and_counts <- select(sample_df, gene_id, expected_count)
  names(id_and_counts)[2] <- deparse(substitute(sample_df))
  
  return(id_and_counts)
}

df1 <- get_counts(Ovcar4Parental1)
df2 <- get_counts(Ovcar4Parental2)
df3 <- get_counts(Ovcar4Parental3)
df4 <- get_counts(Ov4Carbopool1)
df5 <- get_counts(Ov4Carbopool2)
df6 <- get_counts(Ov4Carbopool3)
# Merging all data sets by "gene_id" column 

rawCounts <- list(df1, df2, df3, df4, df5 ,df6) %>% 
                reduce(inner_join, by='gene_id')

Differential Expression Analysis

if (!require("BiocManager", quietly = TRUE)) 
  install.packages("BiocManager") 
## Warning: package 'BiocManager' was built under R version 4.2.2
## Bioconductor version '3.15' is out-of-date; the current release version '3.16'
##   is available with R version '4.2'; see https://bioconductor.org/install
  BiocManager::install("DESeq2")
## Bioconductor version 3.15 (BiocManager 1.30.19), R 4.2.1 (2022-06-23 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'DESeq2'
## Old packages: 'bookdown', 'broom', 'bslib', 'cli', 'digest', 'doRNG',
##   'evaluate', 'formatR', 'highr', 'htmlwidgets', 'httpuv', 'isoband',
##   'maptools', 'nlme', 'purrr', 'rmarkdown', 'RSQLite', 'shiny', 'tidytree',
##   'tinytex', 'vctrs', 'xfun', 'yulab.utils'
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 4.2.2
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
# Tidying up the data for DESeq2

rownames(rawCounts) <- rawCounts$gene_id

rawCounts <-  select(rawCounts, -1)

rawCounts <- ceiling(rawCounts) # converting floats to int for DESeq2

head(rawCounts)
# Inputting - Sample Groups

sampleData <- read.delim("sample_IDs.csv", header = T)

sampleData <- sampleData %>% 
      separate(sample.name.group, c("sample.name", "group"))

head(sampleData)
# Creating DESeq2 object

dds <- DESeqDataSetFromMatrix(countData = rawCounts,
                              colData = sampleData,
                              design = ~ group)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds
## class: DESeqDataSet 
## dim: 61541 6 
## metadata(1): version
## assays(1): counts
## rownames(61541): ENSG00000000003 ENSG00000000005 ... ENSG00000289643
##   ENSG00000289644
## rowData names(0):
## colnames(6): Ovcar4Parental1 Ovcar4Parental2 ... Ov4Carbopool2
##   Ov4Carbopool3
## colData names(2): sample.name group
# Filtering out low count genes - allowing 10 counts in at least 3 samples

keep <- rowSums(counts(dds) >= 10) >= 3

dds <- dds[keep,]
# Running DESeq2 algo

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Using unsupervised clustering to see the relationship among two groups

# Data normalization

norm.counts <- counts(dds, normalized=TRUE)

write.csv(norm.counts, file="normalized_counts.txt", row.names=TRUE)

head(norm.counts)
##                 Ovcar4Parental1 Ovcar4Parental2 Ovcar4Parental3 Ov4Carbopool1
## ENSG00000000003        868.5490       1010.9747        987.0680      908.6342
## ENSG00000000419       1679.9055       1569.9057       1611.2002     1748.8173
## ENSG00000000457        152.1899        139.1760        164.3483      191.0009
## ENSG00000000460        485.6507        602.3539        689.6758      442.7246
## ENSG00000000971        491.4669        950.8507        805.1109      228.5386
## ENSG00000001036       1343.5367       1475.2660       1338.2646     1055.4731
##                 Ov4Carbopool2 Ov4Carbopool3
## ENSG00000000003      776.0136      881.1733
## ENSG00000000419     2344.5334     2164.6640
## ENSG00000000457      244.7828      188.4023
## ENSG00000000460      471.3371      553.4318
## ENSG00000000971      366.3062      877.2482
## ENSG00000001036     1120.6192     1533.7125
# Data transformation (using variance stabilizing transformations (VST) which produces transformed data on the log2 scale, normalized with respect to library size)

vsd <- vst(dds, blind=FALSE)

head(assay(vsd))
##                 Ovcar4Parental1 Ovcar4Parental2 Ovcar4Parental3 Ov4Carbopool1
## ENSG00000000003        9.929855       10.126994       10.095738      9.988132
## ENSG00000000419       10.804267       10.712576       10.747685     10.858869
## ENSG00000000457        7.966106        7.884723        8.037823      8.182649
## ENSG00000000460        9.205757        9.467683        9.636318      9.095802
## ENSG00000000971        9.220019       10.047051        9.832493      8.363318
## ENSG00000001036       10.503236       10.628728       10.497979     10.183420
##                 Ov4Carbopool2 Ov4Carbopool3
## ENSG00000000003      9.785521      9.948465
## ENSG00000000419     11.260481     11.150564
## ENSG00000000457      8.434630      8.169200
## ENSG00000000460      9.170033      9.363690
## ENSG00000000971      8.875983      9.942705
## ENSG00000001036     10.262225     10.681084
# Hierarchical clustering (dendrogram)

sampleDists <- dist(t(assay(vsd)))
plot(hclust(sampleDists, method = "complete"))

# PCA

plotPCA(vsd, intgroup=c("group"))

# Getting the results of differential expression analysis

res <- results(dds, contrast=c("group", "sensitive", "resistant"))

# Viewing summary of results

summary(res)
## 
## out of 15495 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2244, 14%
## LFC < 0 (down)     : 2024, 13%
## outliers [1]       : 7, 0.045%
## low counts [2]     : 0, 0%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#head(res)
#mcols(res)$description
# Diagnostics - observing how well the data has been captured by the model (plotting Dispersion Estimates)

plotDispEsts(dds)

# Graphical Visualizations of the DE genes – MA plot (shows the log2 fold changes of the genes (each dot is a gene) over the mean of normalized counts for all samples in our data.)

plotMA(res)

Positive log2 fold changes values represent upregulated genes in sensitive compared to resistant group Negative log2 fold changes values represent dowregulated genes in sensitive compared to resistant group

# Graphical Visualizations of the differentially expressed genes, subseting the DE genes with padj < 0.05 and log2FC > |1| i.e. at least two-fold change in expression

library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.2.2
de.sign <- subset(res, padj < 0.05 & abs(log2FoldChange) >1 )

de.sign.genes <- rownames(de.sign)

scale.dat <- t(scale(t(assay(vsd)[de.sign.genes,])))

pheatmap(scale.dat[de.sign.genes,], cluster_rows=TRUE, show_rownames=FALSE, cluster_cols=TRUE)

# Annotation of output tables 

BiocManager::install('AnnotationDbi')
## Bioconductor version 3.15 (BiocManager 1.30.19), R 4.2.1 (2022-06-23 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'AnnotationDbi'
## Old packages: 'bookdown', 'broom', 'bslib', 'cli', 'digest', 'doRNG',
##   'evaluate', 'formatR', 'highr', 'htmlwidgets', 'httpuv', 'isoband',
##   'maptools', 'nlme', 'purrr', 'rmarkdown', 'RSQLite', 'shiny', 'tidytree',
##   'tinytex', 'vctrs', 'xfun', 'yulab.utils'
library(AnnotationDbi)
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
library(org.Hs.eg.db)
## 
res.df <- as.data.frame(res)

res.annot <- mapIds(org.Hs.eg.db, keys = rownames(res.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
res.df.annot <- cbind(res.df,gene.symbol = res.annot) #binding the annotations to the sign.df

head(res.df.annot)
write.csv(res.df.annot, file = "DE_sensitiveVSresistant_results_wGeneSymbol.csv")
# Using volcano plot to visualize the Log2Fold changes of all genes (again each dot represents a gene) plotted against the -Log10 Adjusted pvalue.
# In red are genes with padj < 0.05 and log2fold change more or less than 1. They are either differently upregulated (right) or downregulated (left) in sensitive samples compare to resistant.

BiocManager::install('EnhancedVolcano')
## Bioconductor version 3.15 (BiocManager 1.30.19), R 4.2.1 (2022-06-23 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'EnhancedVolcano'
## Old packages: 'bookdown', 'broom', 'bslib', 'cli', 'digest', 'doRNG',
##   'evaluate', 'formatR', 'highr', 'htmlwidgets', 'httpuv', 'isoband',
##   'maptools', 'nlme', 'purrr', 'rmarkdown', 'RSQLite', 'shiny', 'tidytree',
##   'tinytex', 'vctrs', 'xfun', 'yulab.utils'
library(EnhancedVolcano)
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.2.2
EnhancedVolcano(res.df.annot,
    lab = res.df.annot$gene.symbol,
    x = 'log2FoldChange',
    y = 'padj')

Gene set enrichment analysis

# Performing Pre-Ranked Gene Set Enrichment Analysis (GSEA Pre-Ranked) to understand better what biological processes and pathways are affected 

library(fgsea)

library(data.table)
## Warning: package 'data.table' was built under R version 4.2.2
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     shift
## The following object is masked from 'package:GenomicRanges':
## 
##     shift
## The following object is masked from 'package:IRanges':
## 
##     shift
## The following objects are masked from 'package:S4Vectors':
## 
##     first, second
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggplot2)

library(qusage)
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
# Loading DE results

DE.res <- read.csv("DE_sensitiveVSresistant_results_wGeneSymbol.csv", header = TRUE, row.names = 1)
DE.res
# Loading the Gene Ontology Collection

gmt.file <- read.gmt("c5.go.bp.v7.4.symbols.gmt")
# Ranking DE results by decreasing Log2 Fold-change and then creating a named vector of the log2 fold changes with the names being the gene symbols.

DE.res.ranked <- DE.res[order(DE.res$log2FoldChange, decreasing = T), ]

DE.ranks <- setNames(DE.res.ranked$log2FoldChange, DE.res.ranked$gene.symbol)
# Running the fgsea function

fgseaRes <- fgsea(gmt.file, DE.ranks, minSize=15, maxSize=500)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
head(fgseaRes)
# Enrichment Plot

topPathways <- fgseaRes[order(padj)]

topPathways
res
## log2 fold change (MLE): group sensitive vs resistant 
## Wald test p-value: group sensitive vs resistant 
## DataFrame with 15495 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003   905.402       0.160219  0.163600  0.979334 0.3274152
## ENSG00000000419  1853.171      -0.364759  0.170455 -2.139908 0.0323622
## ENSG00000000457   179.983      -0.454814  0.229966 -1.977749 0.0479570
## ENSG00000000460   540.862       0.276303  0.206161  1.340228 0.1801711
## ENSG00000000971   619.920       0.609931  0.422748  1.442776 0.1490835
## ...                   ...            ...       ...       ...       ...
## ENSG00000289604  335.8182       0.429061  0.269354   1.59292 0.1111771
## ENSG00000289609  189.7031       0.967793  0.498021   1.94328 0.0519826
## ENSG00000289613   24.0473      -0.546796  0.477237  -1.14575 0.2518966
## ENSG00000289614   45.3963      -0.407616  0.394871  -1.03227 0.3019434
## ENSG00000289626   11.8163      -0.933826  0.714042  -1.30780 0.1909404
##                      padj
##                 <numeric>
## ENSG00000000003  0.536898
## ENSG00000000419  0.111981
## ENSG00000000457  0.148879
## ENSG00000000460  0.369557
## ENSG00000000971  0.326394
## ...                   ...
## ENSG00000289604  0.266715
## ENSG00000289609  0.157709
## ENSG00000289613  0.454918
## ENSG00000289614  0.509589
## ENSG00000289626  0.383416

Pathway Enrichment Analysis

# selecting sign. DE genes

library("AnnotationDbi")
library("org.Hs.eg.db")


# adding symbol column for further pathway analysis

res$symbol = mapIds(org.Hs.eg.db,
                    keys=row.names(res), 
                    column="SYMBOL",
                    keytype="ENSEMBL",
                    multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
sig_genes <- res[res$padj <= 0.05 & !is.na(res$padj), "symbol"]

print(paste("Total number of significant genes:", length(sig_genes)))
## [1] "Total number of significant genes: 3442"
write.table(sig_genes, file="significant_genes.txt", row.names=FALSE, col.names=FALSE, quote=FALSE)

Performing pathway analysis online where I uploaded “significant_genes.txt” to Reactome website (https://reactome.org/PathwayBrowser/#TOOL=AT). Selectied parameters:“Project to Humans”

reactome_res <- read.csv("reactome_result.csv")
reactome_res
# top dysregulated pathways

reactome_res %>% dplyr::select(2, 6)

Gene Ontology Biological Pathways (GOBP) related to the tumor micro-environment, particularly the extracellular matrix, were enriched in carboplatin-resistant cells